{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "import re\n",
    "import numpy as np\n",
    "from sklearn.utils import shuffle\n",
    "from utils import *\n",
    "import time\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from unidecode import unidecode\n",
    "from tqdm import tqdm\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Negative</td>\n",
       "      <td>Lebih-lebih lagi dengan  kemudahan internet da...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Positive</td>\n",
       "      <td>boleh memberi teguran kepada parti tetapi perl...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Negative</td>\n",
       "      <td>Adalah membingungkan mengapa masyarakat Cina b...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Positive</td>\n",
       "      <td>Kami menurunkan defisit daripada 6.7 peratus p...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Negative</td>\n",
       "      <td>Ini masalahnya. Bukan rakyat, tetapi sistem</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      label                                               text\n",
       "0  Negative  Lebih-lebih lagi dengan  kemudahan internet da...\n",
       "1  Positive  boleh memberi teguran kepada parti tetapi perl...\n",
       "2  Negative  Adalah membingungkan mengapa masyarakat Cina b...\n",
       "3  Positive  Kami menurunkan defisit daripada 6.7 peratus p...\n",
       "4  Negative        Ini masalahnya. Bukan rakyat, tetapi sistem"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('sentiment-news-bahasa-v5.csv')\n",
    "Y = LabelEncoder().fit_transform(df.label)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def textcleaning(string):\n",
    "    string = re.sub('http\\S+|www.\\S+', '',' '.join([i for i in string.split() if i.find('#')<0 and i.find('@')<0]))\n",
    "    string = unidecode(string).replace('.', '. ').replace(',', ', ')\n",
    "    string = re.sub('[^\\'\\\"A-Za-z\\- ]+', ' ', string)\n",
    "    return ' '.join([i for i in re.findall(\"[\\\\w']+|[;:\\-\\(\\)&.,!?\\\"]\", string) if len(i)>1]).lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(df.shape[0]):\n",
    "    df.iloc[i,1] = textcleaning(df.iloc[i,1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('polarity-negative-translated.txt','r') as fopen:\n",
    "    texts = fopen.read().split('\\n')\n",
    "labels = [0] * len(texts)\n",
    "\n",
    "with open('polarity-positive-translated.txt','r') as fopen:\n",
    "    positive_texts = fopen.read().split('\\n')\n",
    "labels += [1] * len(positive_texts)\n",
    "texts += positive_texts\n",
    "texts += df.iloc[:,1].tolist()\n",
    "labels += Y.tolist()\n",
    "\n",
    "assert len(labels) == len(texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "vocab from size: 18957\n",
      "Most common words [('yang', 14891), ('dan', 8177), ('tidak', 4578), ('untuk', 4023), ('dengan', 3349), ('filem', 3279)]\n",
      "Sample data [1631, 204, 5, 161, 218, 106, 303, 4, 78, 202] ['ringkas', 'bodoh', 'dan', 'membosankan', 'kanak-kanak', 'lelaki', 'remaja', 'yang', 'begitu', 'muda']\n"
     ]
    }
   ],
   "source": [
    "concat = ' '.join(texts).split()\n",
    "vocabulary_size = len(list(set(concat)))\n",
    "data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)\n",
    "print('vocab from size: %d'%(vocabulary_size))\n",
    "print('Most common words', count[4:10])\n",
    "print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def str_idx(corpus, dic, maxlen, UNK=3):\n",
    "    X = np.zeros((len(corpus),maxlen))\n",
    "    for i in range(len(corpus)):\n",
    "        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):\n",
    "            try:\n",
    "                X[i,-1 - no]=dic[k]\n",
    "            except Exception as e:\n",
    "                X[i,-1 - no]=UNK\n",
    "    return X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def attention(inputs, attention_size):\n",
    "    hidden_size = inputs.shape[2].value\n",
    "    w_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))\n",
    "    b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))\n",
    "    u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))\n",
    "    with tf.name_scope('v'):\n",
    "        v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)\n",
    "    vu = tf.tensordot(v, u_omega, axes=1, name='vu')\n",
    "    alphas = tf.nn.softmax(vu, name='alphas')\n",
    "    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)\n",
    "    return output, alphas\n",
    "\n",
    "class Model:\n",
    "    def __init__(self, size_layer, num_layers, dropout, dimension_output, learning_rate, maxlen,\n",
    "                dict_size):\n",
    "        def cells(size, reuse=False):\n",
    "            return tf.contrib.rnn.DropoutWrapper(\n",
    "                tf.nn.rnn_cell.LSTMCell(size,initializer=tf.orthogonal_initializer(),reuse=reuse),\n",
    "                state_keep_prob=dropout,\n",
    "                output_keep_prob=dropout)\n",
    "        \n",
    "        self.X = tf.placeholder(tf.int32, [None, None])\n",
    "        self.Y = tf.placeholder(tf.int32, [None])\n",
    "        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, size_layer], -1, 1))\n",
    "        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)\n",
    "        \n",
    "        for n in range(num_layers):\n",
    "            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(\n",
    "                cell_fw = cells(size_layer),\n",
    "                cell_bw = cells(size_layer),\n",
    "                inputs = encoder_embedded,\n",
    "                dtype = tf.float32,\n",
    "                scope = 'bidirectional_rnn_%d'%(n))\n",
    "            encoder_embedded = tf.concat((out_fw, out_bw), 2)\n",
    "        self.outputs, self.attention = attention(encoder_embedded,maxlen)\n",
    "        W = tf.get_variable('w',shape=(size_layer*2, 2),initializer=tf.orthogonal_initializer())\n",
    "        b = tf.get_variable('b',shape=(2),initializer=tf.zeros_initializer())\n",
    "        self.logits = tf.add(tf.matmul(self.outputs, W),b,name='logits')\n",
    "        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.logits, \n",
    "                                                                           labels = self.Y))\n",
    "        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)\n",
    "        self.accuracy = tf.reduce_mean(tf.cast(tf.nn.in_top_k(self.logits, self.Y, 1), tf.float32))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'hierarchical/model.ckpt'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "size_layer = 256\n",
    "num_layers = 2\n",
    "dropout = 0.8\n",
    "dimension_output = 2\n",
    "learning_rate = 1e-4\n",
    "batch_size = 32\n",
    "maxlen = 100\n",
    "\n",
    "tf.reset_default_graph()\n",
    "sess = tf.InteractiveSession()\n",
    "model = Model(size_layer,num_layers,dropout,dimension_output,learning_rate,maxlen,len(dictionary))\n",
    "sess.run(tf.global_variables_initializer())\n",
    "saver = tf.train.Saver(tf.global_variables())\n",
    "saver.save(sess, \"hierarchical/model.ckpt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectors = str_idx(texts, dictionary, maxlen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(vectors, \n",
    "                                                    labels,\n",
    "                                                    test_size = 0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.22it/s, accuracy=0.806, cost=0.572]\n",
      "test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.70it/s, accuracy=0.625, cost=0.613]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 0, pass acc: 0.000000, current acc: 0.623950\n",
      "time taken: 92.23545956611633\n",
      "epoch: 0, training loss: 0.673939, training acc: 0.574175, valid loss: 0.655924, valid acc: 0.623950\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.23it/s, accuracy=0.839, cost=0.486]\n",
      "test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 12.00it/s, accuracy=0.75, cost=0.584] \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 1, pass acc: 0.623950, current acc: 0.661064\n",
      "time taken: 91.80484056472778\n",
      "epoch: 1, training loss: 0.616633, training acc: 0.657256, valid loss: 0.627682, valid acc: 0.661064\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.24it/s, accuracy=0.806, cost=0.44] \n",
      "test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.85it/s, accuracy=0.875, cost=0.616]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 2, pass acc: 0.661064, current acc: 0.684524\n",
      "time taken: 91.81486558914185\n",
      "epoch: 2, training loss: 0.564194, training acc: 0.705840, valid loss: 0.621890, valid acc: 0.684524\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.23it/s, accuracy=0.839, cost=0.375]\n",
      "test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.99it/s, accuracy=0.75, cost=0.665] \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 3, pass acc: 0.684524, current acc: 0.689426\n",
      "time taken: 91.83554458618164\n",
      "epoch: 3, training loss: 0.513168, training acc: 0.745062, valid loss: 0.627222, valid acc: 0.689426\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.21it/s, accuracy=0.903, cost=0.293]\n",
      "test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.84it/s, accuracy=0.75, cost=0.741] \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 4, pass acc: 0.689426, current acc: 0.690476\n",
      "time taken: 92.32510042190552\n",
      "epoch: 4, training loss: 0.458082, training acc: 0.783936, valid loss: 0.649062, valid acc: 0.690476\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.20it/s, accuracy=0.871, cost=0.274]\n",
      "test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.91it/s, accuracy=0.5, cost=0.904]  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 5, pass acc: 0.690476, current acc: 0.692927\n",
      "time taken: 92.5005795955658\n",
      "epoch: 5, training loss: 0.403465, training acc: 0.818688, valid loss: 0.687502, valid acc: 0.692927\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.21it/s, accuracy=0.935, cost=0.185]\n",
      "test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.73it/s, accuracy=0.5, cost=1.11]   \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 6, pass acc: 0.692927, current acc: 0.696779\n",
      "time taken: 92.46164631843567\n",
      "epoch: 6, training loss: 0.350288, training acc: 0.845744, valid loss: 0.746539, valid acc: 0.696779\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.23it/s, accuracy=0.968, cost=0.166]\n",
      "test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.84it/s, accuracy=0.5, cost=1.35]   \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 7, pass acc: 0.696779, current acc: 0.700980\n",
      "time taken: 92.025390625\n",
      "epoch: 7, training loss: 0.300940, training acc: 0.872798, valid loss: 0.799389, valid acc: 0.700980\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.24it/s, accuracy=0.935, cost=0.158] \n",
      "test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.77it/s, accuracy=0.375, cost=1.85] \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time taken: 91.94457387924194\n",
      "epoch: 8, training loss: 0.261112, training acc: 0.889165, valid loss: 0.833415, valid acc: 0.693978\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.22it/s, accuracy=0.935, cost=0.139] \n",
      "test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.71it/s, accuracy=0.375, cost=1.67] \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time taken: 92.27956342697144\n",
      "epoch: 9, training loss: 0.232353, training acc: 0.903960, valid loss: 0.856125, valid acc: 0.688725\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.22it/s, accuracy=0.968, cost=0.119] \n",
      "test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.82it/s, accuracy=0.5, cost=1.58]   \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time taken: 92.19576978683472\n",
      "epoch: 10, training loss: 0.194886, training acc: 0.920508, valid loss: 0.904724, valid acc: 0.695028\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.24it/s, accuracy=0.935, cost=0.115] \n",
      "test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.85it/s, accuracy=0.375, cost=2.56] \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time taken: 91.8014886379242\n",
      "epoch: 11, training loss: 0.160236, training acc: 0.938189, valid loss: 1.080001, valid acc: 0.689426\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 357/357 [01:24<00:00,  4.22it/s, accuracy=0.903, cost=0.106] \n",
      "test minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.87it/s, accuracy=0.375, cost=2.3]  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time taken: 92.0934100151062\n",
      "epoch: 12, training loss: 0.141676, training acc: 0.944489, valid loss: 1.072974, valid acc: 0.697479\n",
      "\n",
      "break epoch:13\n",
      "\n"
     ]
    }
   ],
   "source": [
    "EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0\n",
    "while True:\n",
    "    lasttime = time.time()\n",
    "    if CURRENT_CHECKPOINT == EARLY_STOPPING:\n",
    "        print('break epoch:%d\\n'%(EPOCH))\n",
    "        break\n",
    "        \n",
    "    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0\n",
    "    pbar = tqdm(range(0, len(train_X), batch_size), desc='train minibatch loop')\n",
    "    for i in pbar:\n",
    "        batch_x = train_X[i:min(i+batch_size,train_X.shape[0])]\n",
    "        batch_y = train_Y[i:min(i+batch_size,train_X.shape[0])]\n",
    "        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], \n",
    "                           feed_dict = {model.X : batch_x, model.Y : batch_y})\n",
    "        assert not np.isnan(loss)\n",
    "        train_loss += loss\n",
    "        train_acc += acc\n",
    "        pbar.set_postfix(cost=loss, accuracy = acc)\n",
    "    \n",
    "    pbar = tqdm(range(0, len(test_X), batch_size), desc='test minibatch loop')\n",
    "    for i in pbar:\n",
    "        batch_x = test_X[i:min(i+batch_size,test_X.shape[0])]\n",
    "        batch_y = test_Y[i:min(i+batch_size,test_X.shape[0])]\n",
    "        acc, loss = sess.run([model.accuracy, model.cost], \n",
    "                           feed_dict = {model.X : batch_x, model.Y : batch_y})\n",
    "        test_loss += loss\n",
    "        test_acc += acc\n",
    "        pbar.set_postfix(cost=loss, accuracy = acc)\n",
    "    \n",
    "    train_loss /= (len(train_X) / batch_size)\n",
    "    train_acc /= (len(train_X) / batch_size)\n",
    "    test_loss /= (len(test_X) / batch_size)\n",
    "    test_acc /= (len(test_X) / batch_size)\n",
    "    \n",
    "    if test_acc > CURRENT_ACC:\n",
    "        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))\n",
    "        CURRENT_ACC = test_acc\n",
    "        CURRENT_CHECKPOINT = 0\n",
    "    else:\n",
    "        CURRENT_CHECKPOINT += 1\n",
    "        \n",
    "    print('time taken:', time.time()-lasttime)\n",
    "    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\\n'%(EPOCH,train_loss,\n",
    "                                                                                          train_acc,test_loss,\n",
    "                                                                                          test_acc))\n",
    "    EPOCH += 1\n",
    "    saver.save(sess, \"hierarchical/model.ckpt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "validation minibatch loop: 100%|██████████| 90/90 [00:07<00:00, 11.92it/s]\n"
     ]
    }
   ],
   "source": [
    "real_Y, predict_Y = [], []\n",
    "\n",
    "pbar = tqdm(range(0, len(test_X), batch_size), desc='validation minibatch loop')\n",
    "for i in pbar:\n",
    "    batch_x = test_X[i:min(i+batch_size,test_X.shape[0])]\n",
    "    batch_y = test_Y[i:min(i+batch_size,test_X.shape[0])]\n",
    "    predict_Y += np.argmax(sess.run(model.logits, feed_dict = {model.X : batch_x, model.Y : batch_y}),1).tolist()\n",
    "    real_Y += batch_y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "   negative       0.65      0.69      0.67      1289\n",
      "   positive       0.73      0.70      0.71      1567\n",
      "\n",
      "avg / total       0.70      0.69      0.69      2856\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "print(metrics.classification_report(real_Y, predict_Y, target_names = ['negative','positive']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "strings=','.join([n.name for n in tf.get_default_graph().as_graph_def().node if \"Variable\" in n.op or n.name.find('Placeholder') >= 0 or n.name.find('logits') == 0 or n.name.find('alphas') == 0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def freeze_graph(model_dir, output_node_names):\n",
    "\n",
    "    if not tf.gfile.Exists(model_dir):\n",
    "        raise AssertionError(\n",
    "            \"Export directory doesn't exists. Please specify an export \"\n",
    "            \"directory: %s\" % model_dir)\n",
    "\n",
    "    checkpoint = tf.train.get_checkpoint_state(model_dir)\n",
    "    input_checkpoint = checkpoint.model_checkpoint_path\n",
    "    \n",
    "    absolute_model_dir = \"/\".join(input_checkpoint.split('/')[:-1])\n",
    "    output_graph = absolute_model_dir + \"/frozen_model.pb\"\n",
    "    clear_devices = True\n",
    "    with tf.Session(graph=tf.Graph()) as sess:\n",
    "        saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices)\n",
    "        saver.restore(sess, input_checkpoint)\n",
    "        output_graph_def = tf.graph_util.convert_variables_to_constants(\n",
    "            sess,\n",
    "            tf.get_default_graph().as_graph_def(),\n",
    "            output_node_names.split(\",\")\n",
    "        ) \n",
    "        with tf.gfile.GFile(output_graph, \"wb\") as f:\n",
    "            f.write(output_graph_def.SerializeToString())\n",
    "        print(\"%d ops in the final graph.\" % len(output_graph_def.node))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Restoring parameters from hierarchical/model.ckpt\n",
      "INFO:tensorflow:Froze 44 variables.\n",
      "Converted 44 variables to const ops.\n",
      "793 ops in the final graph.\n"
     ]
    }
   ],
   "source": [
    "freeze_graph(\"hierarchical\", strings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_graph(frozen_graph_filename):\n",
    "    with tf.gfile.GFile(frozen_graph_filename, \"rb\") as f:\n",
    "        graph_def = tf.GraphDef()\n",
    "        graph_def.ParseFromString(f.read())\n",
    "    with tf.Graph().as_default() as graph:\n",
    "        tf.import_graph_def(graph_def)\n",
    "    return graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "g=load_graph('hierarchical/frozen_model.pb')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1, 100)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = g.get_tensor_by_name('import/Placeholder:0')\n",
    "logits = g.get_tensor_by_name('import/logits:0')\n",
    "alphas = g.get_tensor_by_name('import/alphas:0')\n",
    "test_sess = tf.InteractiveSession(graph=g)\n",
    "test_sess.run([logits,alphas], feed_dict={x:vectors[:1]})[1].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([[0.25555208, 0.7444479 ]], dtype=float32),\n",
       " array([[0.00209559, 0.00378773, 0.01576839, 0.02166901, 0.0758793 ,\n",
       "         0.15169376, 0.29709268, 0.29184714, 0.14016648]], dtype=float32)]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = 'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'\n",
    "new_vector = str_idx([text],dictionary,len(text.split()))\n",
    "test_sess.run([tf.nn.softmax(logits),alphas], feed_dict={x:new_vector})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([[0.16736251, 0.8326375 ]], dtype=float32),\n",
       " array([[0.0156941 , 0.06104115, 0.11414091, 0.18187664, 0.62724715]],\n",
       "       dtype=float32)]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = 'saya sangat sayangkan kerajaan saya'\n",
    "new_vector = str_idx([text],dictionary,len(text.split()))\n",
    "test_sess.run([tf.nn.softmax(logits),alphas], feed_dict={x:new_vector})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([[0.5802783 , 0.41972172]], dtype=float32),\n",
       " array([[0.16799149, 0.28151396, 0.2692253 , 0.2812692 ]], dtype=float32)]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = 'bodoh lah awak ni'\n",
    "new_vector = str_idx([text],dictionary,len(text.split()))\n",
    "test_sess.run([tf.nn.softmax(logits),alphas], feed_dict={x:new_vector})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([[0.3700046, 0.6299954]], dtype=float32),\n",
       " array([[0.16361861, 0.15003377, 0.34092915, 0.34541845]], dtype=float32)]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = 'kerajaan sebenarnya sangat baik'\n",
    "new_vector = str_idx([text],dictionary,len(text.split()))\n",
    "test_sess.run([tf.nn.softmax(logits),alphas], feed_dict={x:new_vector})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "with open('hierarchical-sentiment.json','w') as fopen:\n",
    "    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
